!> \file
!! Main program for the LDG-H method; includes fractures
!!
!! \n
!!
!! This program is similar to ldgh.f90, except that it also includes
!! fractures in the domain.
!<
program ldghf_main

 use mod_utils, only: &
   t_realtime, my_second

 use mod_messages, only: &
   mod_messages_constructor, &
   mod_messages_destructor,  &
   mod_messages_initialized, &
   error,   &
   warning, &
   info

 use mod_kinds, only: &
   mod_kinds_constructor, &
   mod_kinds_destructor,  &
   mod_kinds_initialized, &
   wp

 use mod_fu_manager, only: &
   mod_fu_manager_constructor, &
   mod_fu_manager_destructor,  &
   mod_fu_manager_initialized, &
   new_file_unit

 !$ use mod_omp_utils, only: &
 !$   mod_omp_utils_constructor, &
 !$   mod_omp_utils_destructor,  &
 !$   mod_omp_utils_initialized, &
 !$   detailed_timing_omp, &
 !$   omput_push_key,      &
 !$   omput_pop_key,       &
 !$   omput_start_timer,   &
 !$   omput_close_timer,   &
 !$   omput_write_time

 use mod_octave_io, only: &
   mod_octave_io_constructor, &
   mod_octave_io_destructor,  &
   mod_octave_io_initialized, &
   write_octave, &
   read_octave

 use mod_mpi_utils, only: &
   mod_mpi_utils_constructor, &
   mod_mpi_utils_destructor,  &
   mod_mpi_utils_initialized, &
   mpi_comm_world, &
   mpi_init, mpi_finalize, &
   mpi_comm_rank

 use mod_sympoly, only: &
   mod_sympoly_constructor, &
   mod_sympoly_destructor,  &
   mod_sympoly_initialized, &
   show

 use mod_octave_io_sympoly, only: &
   mod_octave_io_sympoly_constructor, &
   mod_octave_io_sympoly_destructor,  &
   mod_octave_io_sympoly_initialized, &
   write_octave

 use mod_linal, only: &
   mod_linal_constructor, &
   mod_linal_destructor,  &
   mod_linal_initialized, &
   invmat_nop, &
   invmat_chol

 use mod_perms, only: &
   mod_perms_constructor, &
   mod_perms_destructor,  &
   mod_perms_initialized

 use mod_octave_io_perms, only: &
   mod_octave_io_perms_constructor, &
   mod_octave_io_perms_destructor,  &
   mod_octave_io_perms_initialized

 use mod_sparse, only: &
   mod_sparse_constructor, &
   mod_sparse_destructor,  &
   mod_sparse_initialized, &
   ! sparse types
   t_col,       &
   t_tri,       &
   ! construction of new objects
   new_col,     &
   new_tri,     &
   ! convertions
   col2tri,     &
   tri2col,     &
   ! overloaded operators
   operator(+), &
   operator(*), &
   sum,         &
   transpose,   &
   matmul,      &
   clear

 use mod_octave_io_sparse, only: &
   mod_octave_io_sparse_constructor, &
   mod_octave_io_sparse_destructor,  &
   mod_octave_io_sparse_initialized, &
   write_octave

 use mod_linsolver, only: &
   mod_linsolver_constructor, &
   mod_linsolver_destructor,  &
   !1) Parameters which should be set before using the solvers:
   t_nsolver_general_parms, linsolver_general_parms, &
   t_nsolver_umfpack_parms, linsolver_umfpack_parms, &
   t_nsolver_mumps_parms,   linsolver_mumps_parms,   &
   t_nsolver_itersol_parms, linsolver_itersol_parms, &
   !2) Main module functions
   factor, solve, clean

 use mod_output_control, only: &
   mod_output_control_constructor, &
   mod_output_control_destructor,  &
   mod_output_control_initialized, &
   elapsed_format, &
   base_name

 use mod_master_el, only: &
   mod_master_el_constructor, &
   mod_master_el_destructor,  &
   mod_master_el_initialized

 use mod_grid, only: &
   mod_grid_constructor, &
   mod_grid_destructor,  &
   mod_grid_initialized, &
   t_grid,               &
   new_grid, clear,      &
   new_subgrid,          &
   affmap,               &
   write_octave
 
 use mod_numquad, only: &
   mod_numquad_constructor, &
   mod_numquad_destructor,  &
   mod_numquad_initialized

 use mod_base, only: &
   mod_base_constructor, &
   mod_base_destructor,  &
   mod_base_initialized, &
   t_base, clear, &
   write_octave

 use mod_bcs, only: &
   mod_bcs_constructor, &
   mod_bcs_destructor,  &
   mod_bcs_initialized, &
   b_dir,                 &
   t_bcs, t_b_s, p_t_b_s, &
   new_bcs, clear

 use mod_fe_spaces, only: &
   mod_fe_spaces_constructor, &
   mod_fe_spaces_destructor,  &
   mod_fe_spaces_initialized, &
   rt,      & ! complete f.e. space RT
   ldgh,    & ! complete f.e. space LDG-H
   bdm        ! complete f.e. space BDM

 use mod_testcases, only: &
   mod_testcases_constructor, &
   mod_testcases_destructor,  &
   mod_testcases_initialized, &
   test_name,   &
   test_description,&
   coeff_dir,   &
   coeff_xiadv, &
   coeff_lam,   &
   coeff_q, coeff_divq

 use mod_tau, only: &
   mod_tau_constructor, &
   mod_tau_destructor,  &
   mod_tau_initialized, &
   taus

 use mod_ldgh_locmat, only: &
   mod_ldgh_locmat_constructor, &
   mod_ldgh_locmat_destructor,  &
   mod_ldgh_locmat_initialized, &
   ldgh_locmat, t_bcs_error

 use mod_ldg_locmat, only: &
   mod_ldg_locmat_constructor, &
   mod_ldg_locmat_destructor,  &
   mod_ldg_locmat_initialized, &
   ldg_locmat_e, ldg_locmat_s, &
   ldg_locmat_bs, &
   t_sub_bcs_error => t_bcs_error

! use mod_ldgh_reconstructions, only: &
!   mod_ldgh_reconstructions_constructor, &
!   mod_ldgh_reconstructions_destructor,  &
!   mod_ldgh_reconstructions_initialized, &
!   uuu, qqq, qhn, &
!   bases, uuus, uuusef, xiadv_el, &
!!   base_qsdg, qqqsdg,   &
!   base_qsrt, qqqsrt
!
! use mod_error_norms, only: &
!   mod_error_norms_constructor, &
!   mod_error_norms_destructor,  &
!   mod_error_norms_initialized, &
!   hybrid_err, &
!   primal_err, &
!   primal_err_sef, &
!   dual_err

!-----------------------------------------------------------------------
 
 implicit none

!-----------------------------------------------------------------------
 
 ! Parameters
 integer, parameter :: max_char_len = 1000
 character(len=*), parameter :: input_file_name = 'ldghf.in'
 character(len=*), parameter :: this_prog_name = 'ldghf'

 ! Main variables
 logical :: write_grid, write_sys, compute_error_norms, compute_usef
 integer :: k, stab_tauk, stab_bartauk, nbound, err_extra_deg
 integer, allocatable :: bc_type_t(:,:)
 character(len=max_char_len) :: method, grid_file, in_testname, &
   stab_method, in_basename, cbc_type
 type(t_base) :: base, sub_base
 type(t_grid), target :: grid, sub_grid
 integer, allocatable :: sub_v2v(:), sub_e2s(:)
 type(t_bcs), target :: bcs, sub_bcs

 ! Local matrixes
 integer :: dp, mk, pk, nk
 integer :: pos_shift_ie, pos_shift_isl, pos_shift_igdll, &
   pos_shift_jsl, pos
 integer :: ie, isl, igdll, is, i, il, jsl, jgdll, js, j, jl
 real(wp), allocatable :: &
   aak(:,:) , bbk(:,:) , cck(:,:) ,         &
   ddk(:,:) , eek(:,:) , hhk(:,:) , f2k(:), &
   ffk(:,:) , ggk(:,:) , llk(:,:) , f3k(:), &
  aaki(:,:) , wwk(:,:) , qqk(:,:) ,         &
  ddkaaki(:,:) , vvk(:,:) , uuk(:,:) ,      &
  mmmk(:,:) , fffk(:)
 
 ! Global matrix
 integer :: sdim, nnz_mmm11
 integer, allocatable :: mmmi(:), mmmj(:)
 real(wp), allocatable :: mmmx(:), fff(:), fff1(:), fff2(:), &
  lam(:), lam1(:), rhs(:)
 type(t_col) :: mmm, mmm11, mmm12, mmm21, mmm22

 ! Fracture specific matrices
 integer :: je, pos_shift_is, ii, jj
 real(wp), allocatable :: f1k(:)
 integer, allocatable :: f_mai(:), f_maj(:), f_mbi(:), f_mbj(:), &
   f_mci(:), f_mcj(:), f_mdi(:), f_mdj(:), dofs_remap(:)
 real(wp), allocatable :: f_max(:), f_mbx(:), f_mcx(:), f_mdx(:), &
   f_1(:), f_2(:)
 type(t_col) :: f_ma, f_mb, f_mc, f_md
 type(t_tri) :: f_md_g

 ! bcs variables
 integer :: is_dir, is_int
 integer, allocatable :: dofs_dir(:), dofs_nat(:)
 real(wp), allocatable :: lam2(:), norm_e2(:)

 ! Linear system solver
 logical :: itsol_abstol
 integer :: mpi_id, umfpack_print_level, precon_ndiags
 real(wp) :: itsol_toll
 character(len=max_char_len) :: linsolver, itsolver, &
   preconditioner

 ! error computations
 real(wp) :: e_l2s, e_l2k, e_pl2s, e_pl2k, e_u_l2, e_us_l2, e_usef_l2, &
   e_q_l2, e_q_hdiv, e_qsdg_l2, e_qsdg_hdiv, e_qsrt_l2, e_qsrt_hdiv

 ! IO variables
 character(len=*), parameter :: out_file_nml_suff = '-nml.octxt'
 character(len=10000+len(out_file_nml_suff)) :: out_file_nml_name
 character(len=*), parameter :: out_file_grid_suff = '-grid.octxt'
 character(len=10000+len(out_file_grid_suff)) :: out_file_grid_name
 character(len=*), parameter :: out_file_base_suff = '-base.octxt'
 character(len=10000+len(out_file_base_suff)) :: out_file_base_name
 character(len=*), parameter :: out_file_results_suff = '-results.octxt'
 character(len=10000+len(out_file_results_suff)) :: out_file_results_name

 ! Auxiliary variables
 integer :: fu, ierr
 real(t_realtime) :: t00, t0, t1
 character(len=10*max_char_len) :: message(3)
 type(t_bcs_error) :: b_err
 type(t_sub_bcs_error) :: b_sub_err

 ! Input namelist
 namelist /input/ &
   ! test case
   in_testname, &
   ! output base name
   in_basename, &
   ! base
   k, method,   &
   ! grid
   write_grid,  &
   grid_file,   &
   ! stabilization
   stab_method, stab_tauk, stab_bartauk, &
   ! boundary conditions
   nbound, cbc_type, &
   ! linear system solver
   linsolver,   &
   ! UMFPACK control variables
   umfpack_print_level, &
   ! iterative solver control variables (used only if linsolver.eq."iterative")
   itsolver,                 &
   itsol_abstol, itsol_toll, &
   !  precon_ndiags can be larger than the size of the matrix, in
   !  which case the whole matrix is used as preconditioner
   preconditioner, precon_ndiags, &
   ! results
   write_sys,   &
   ! postprocessing
   compute_usef,&
   ! errors
   compute_error_norms, err_extra_deg

 !-----------------------------------------------------------------------
 ! Setup of all the MPI processes
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Preliminary initializations
 t00 = my_second()
 call mod_messages_constructor()

 !$ if(detailed_timing_omp) call mod_omp_utils_constructor()

 call mod_kinds_constructor()

 call mpi_init(ierr)
 call mod_mpi_utils_constructor()
 call mpi_comm_rank(mpi_comm_world,mpi_id,ierr)

 call mod_fu_manager_constructor()

 call mod_octave_io_constructor()

 call mod_sympoly_constructor()
 call mod_octave_io_sympoly_constructor()

 call mod_linal_constructor()

 call mod_perms_constructor()
 call mod_octave_io_perms_constructor()

 call mod_sparse_constructor()
 call mod_octave_io_sparse_constructor()
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Read input file
 call new_file_unit(fu,ierr)
 open(fu,file=trim(input_file_name), &
      status='old',action='read',form='formatted',iostat=ierr)
 read(fu,input)
 close(fu,iostat=ierr)
 ! echo the input namelist
 out_file_nml_name = trim(in_basename)//out_file_nml_suff
 open(fu,file=trim(out_file_nml_name), &
      status='replace',action='write',form='formatted',iostat=ierr)
 write(fu,input)
 close(fu,iostat=ierr)

 allocate(bc_type_t(nbound,2))
 read(cbc_type,*) bc_type_t ! transposed, for simplicity
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! We can now initialize the linsolver constructor on ALL the
 ! processes and set the solver parameters (in fact, such parameters
 ! will not change during the entire execution).
 call mod_linsolver_constructor(trim(linsolver))
 linsolver_general_parms%transposed_mat = .false.
 linsolver_umfpack_parms%umfpack_print_level = &
         umfpack_print_level
 linsolver_itersol_parms%itsolver       = itsolver
 linsolver_itersol_parms%itsol_toll     = itsol_toll
 linsolver_itersol_parms%itsol_abstol   = itsol_abstol
 linsolver_itersol_parms%preconditioner = preconditioner
 linsolver_itersol_parms%precon_ndiags  = precon_ndiags
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! End of the setup of all the MPI processes: from now on, only the
 ! master process executes the code, while the remaining processes are
 ! involved only in the solution of the linear system when the solver
 ! is mumps.
 mpi_if_1: if(mpi_id.eq.0) then
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! output control setup
 call mod_output_control_constructor(in_basename)
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Construct the grid
 t0 = my_second()

 call mod_master_el_constructor()

 call mod_grid_constructor()
 call new_grid(grid,trim(grid_file))
 call new_subgrid(sub_grid,sub_v2v,sub_e2s,grid,trim(grid_file))

 ! write the octave output
 if(write_grid) then
   out_file_grid_name = trim(base_name)//out_file_grid_suff
   call new_file_unit(fu,ierr)
   open(fu,file=trim(out_file_grid_name), &
        status='replace',action='write',form='formatted',iostat=ierr)
    call write_octave(grid,'grid',fu)
    call write_octave(sub_grid,'sub_grid',fu)
    call write_octave(sub_v2v,'r','sub_v2v',fu)
    call write_octave(sub_e2s,'r','sub_e2s',fu)
   close(fu,iostat=ierr)
 endif

 t1 = my_second()
 write(message(1),elapsed_format) &
   'Completed grid construction: elapsed time ',t1-t0,'s.'
 call info(this_prog_name,'',message(1))
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Define the finite element space
 t0 = my_second()

 call mod_numquad_constructor()

 call mod_base_constructor()

 call mod_fe_spaces_constructor()
 select case(trim(method))
  case('rt')
   base = rt(grid%me,k)
  case('ldgh')
   base = ldgh(grid%me,k)
  case('bdm')
   base = bdm(grid%me,k)
  case default
   write(message(1),'(A,A,A)') 'Unknown method "',trim(method),'".'
   call error(this_prog_name,'',message(1))
 end select
 sub_base = ldgh(sub_grid%me,k)

 ! write the octave output
 out_file_base_name = trim(base_name)//out_file_base_suff
 call new_file_unit(fu,ierr)
 open(fu,file=trim(out_file_base_name), &
      status='replace',action='write',form='formatted',iostat=ierr)
   call write_octave(base,    'base',    fu)
   call write_octave(sub_base,'sub_base',fu)
 close(fu,iostat=ierr)

 t1 = my_second()
 write(message(1),elapsed_format) &
   'Completed FE basis construction: elapsed time ',t1-t0,'s.'
 call info(this_prog_name,'',message(1))
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Collect the information concerning the boundary conditions
 call mod_bcs_constructor()
 call new_bcs(bcs,    grid,    transpose(bc_type_t))
 call new_bcs(sub_bcs,sub_grid,transpose(bc_type_t))
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Test case setup
 call mod_testcases_constructor(in_testname,grid%m)
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Define the stabilization parameter tau
 !> \bug check the definition of the \f$\tau\f$ parameter
 call mod_tau_constructor(trim(stab_method),stab_tauk,stab_bartauk, &
                          grid,base,write_grid)
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Assemble matrix and right hand side
 call mod_ldgh_locmat_constructor(base)
 call mod_ldg_locmat_constructor(sub_base)

 t0 = my_second()
 !$ if(detailed_timing_omp) then
 !$   call omput_push_key("LocalMatrices")
 !$   call omput_start_timer()
 !$ endif

 !$omp parallel private(dp , mk , pk , nk ,    &
 !$omp                  aak , bbk , cck ,      &
 !$omp                  ddk , eek , hhk , f2k, &
 !$omp                  ffk , ggk , llk , f3k, &
 !$omp                 aaki , wwk , qqk ,      &
 !$omp              ddkaaki , vvk , uuk ,      &
 !$omp                 mmmk , fffk,            &
 !$omp                  ie , b_err , pos_shift_ie , &
 !$omp                  isl , pos_shift_isl , igdll , &
 !$omp                  pos_shift_igdll , is, i , il , &
 !$omp                  jsl , pos_shift_jsl , jgdll , &
 !$omp                  js, j , jl , pos,      &
 !$omp                  message)&
 !$omp           shared(base,grid,bcs,sdim,mmmi,mmmj,mmmx,fff, &
 !$omp                  taus)   &
 !$omp          default(none)

 dp = base%me%d+1
 mk = base%mk
 pk = base%pk
 nk = base%nk
 allocate( aak(   mk,mk) , bbk(   mk,pk) , cck(   mk,dp*nk) ,            &
           ddk(   pk,mk) , eek(   pk,pk) , hhk(   pk,dp*nk) , f2k(   pk),&
           ffk(dp*nk,mk) , ggk(dp*nk,pk) , llk(dp*nk,dp*nk) , f3k(dp*nk),&
           aaki(  mk,mk) , wwk(   mk,pk) , qqk(   mk,dp*nk) ,            &
          ddkaaki(pk,mk) , vvk(   pk,pk) , uuk(   pk,dp*nk) ,            &
          mmmk(dp*nk,dp*nk), fffk(dp*nk) )
 !$omp single
  sdim = grid%ns*nk ! problem dimension
  allocate( mmmi(grid%ne*(dp*nk)**2) , mmmj(grid%ne*(dp*nk)**2) )
  allocate( mmmx(grid%ne*(dp*nk)**2) , fff(sdim) )
  fff = 0.0_wp
 !$omp end single

 !$omp do schedule(static)
 elem_loop: do ie=1,grid%ne

   !---------------------------------------------------------------------
   !1) compute local matrices
   call ldgh_locmat( aak , bbk , cck ,      &
                     ddk , eek , hhk , f2k, &
                     ffk , ggk , llk , f3k, &
                     base, grid%e(ie), taus(ie), &
                     bcs%b_e2be(ie), b_err )

   if(b_err%lerr) &
     call error(this_prog_name,'',b_err%message)
   !---------------------------------------------------------------------

   !---------------------------------------------------------------------
   !2) static condensation

   ! aaki = inv(aak)
   call invmat_chol( aak , aaki )
   ! ddkaaki = ddk*aaki
   ddkaaki = matmul(ddk,aaki)

   ! vvk  = inv(eek - ddk*aaki*bbk)
   call invmat_nop( eek - matmul(ddkaaki,bbk) , vvk )
   ! uuk  = vvk*(ddk*aaki*cck - hhk)
   uuk = matmul( vvk , matmul(ddkaaki,cck)-hhk )
    
   ! qqk  = -aaki*(bbk*uuk + cck)
   qqk = -matmul( aaki , matmul(bbk,uuk)+cck )
   ! wwk  = -aaki*bbk*vvk
   wwk = -matmul(aaki,matmul(bbk,vvk))
    
   ! mmmk = ffk*qqk + ggk*uuk + llk
   mmmk = matmul(ffk,qqk) + matmul(ggk,uuk) + llk
   ! fffk = f3k - (ffk*wwk + ggk*vvk)*f2k
   fffk = f3k - matmul( matmul(ffk,wwk)+matmul(ggk,vvk) , f2k)
   !---------------------------------------------------------------------

   !---------------------------------------------------------------------
   !3.1) side oriented summation of the element contributions
   ! The summation is done in two steps, exploting the operations
   ! defined for sparse matrices:
   ! a) the contributions of each element are collected into three
   !  vectors: indices and value, without doing any summation of the
   !  contributions of different elements;
   ! b) the sparse matrix is constructed and reduced: the reduction
   !  operation sums the element contributions.
   ! Notice that step b) has to be done outside the element loop.
   pos_shift_ie = (ie-1)*(dp*nk)**2
   iside_do: do isl=1,dp
     pos_shift_isl = pos_shift_ie + (isl-1)*nk*dp*nk
     igdl_do: do igdll=1,nk
       pos_shift_igdll = pos_shift_isl + (igdll-1)*dp*nk
       ! global side index
       is = grid%e(ie)%is(isl)
       ! matrix index i
       i  =  (is-1)*nk + igdll
       ! local matrix index i
       il = (isl-1)*nk + igdll
       jside_do: do jsl=1,dp
         pos_shift_jsl = pos_shift_igdll + (jsl-1)*nk
         jgdl_do: do jgdll=1,nk
           ! global side index
           js = grid%e(ie)%is(jsl)
           ! matrix index i
           j  =  (js-1)*nk + jgdll
           ! local matrix index i
           jl = (jsl-1)*nk + jgdll

           ! store the values in the unreduced matrix
           pos = pos_shift_jsl + jgdll
           mmmi(pos) = i
           mmmj(pos) = j
           mmmx(pos) = mmmk(il,jl)

         enddo jgdl_do
       enddo jside_do

       ! right hand side
       !$omp atomic
        fff(i) = fff(i) + fffk(il)

     enddo igdl_do
   enddo iside_do
   !---------------------------------------------------------------------

 enddo elem_loop
 !$omp end do nowait

 deallocate( aak , bbk , cck ,      &
             ddk , eek , hhk , f2k, &
             ffk , ggk , llk , f3k, &
            aaki , wwk , qqk ,      &
         ddkaaki , vvk , uuk ,      &
            mmmk , fffk )

 !$omp end parallel
 t1 = my_second()
 write(message(1),elapsed_format) &
   'Completed computation of the local matrices: elapsed time ',t1-t0,'s.'
 call info(this_prog_name,'',message(1))
 !$ if(detailed_timing_omp) then
 !$   call omput_write_time()
 !$   call omput_close_timer()
 !$   call omput_pop_key()
 !$ endif

 !-----------------------------------------------------------------------
 !3.2) complete the side oriented summation of the element contributions
 ! notice that indexes in sparse matrices start from 0
 mmm = tri2col(new_tri(sdim,sdim,mmmi-1,mmmj-1,mmmx))
 deallocate( mmmi , mmmj , mmmx )

 t1 = my_second()
 write(message(1),elapsed_format) &
   'Completed assembling of the linear system: elapsed time ',t1-t0,'s.'
 call info(this_prog_name,'',message(1))
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 !4) The fracture terms now must be included

 t0 = my_second()
 !$ if(detailed_timing_omp) then
 !$   call omput_push_key("LocalFractureMatrices")
 !$   call omput_start_timer()
 !$ endif

 !$omp parallel private(mk , pk ,        &
 !$omp                 aak , bbk , f1k , &
 !$omp                 cck , ddk , f2k , &
 !$omp                aaki ,             &
 !$omp  ie , i , j , pos_shift_ie , pos , ii , &
 !$omp  is , je , pos_shift_is, b_sub_err,     &
 !$omp                  message)         &
 !$omp           shared(sub_base,sub_grid,sub_bcs,sdim, &
 !$omp  f_mai,f_maj,f_max,f_mbi,f_mbj,f_mbx,f_1, &
 !$omp  f_mci,f_mcj,f_mcx,f_mdi,f_mdj,f_mdx,f_2) &
 !$omp          default(none)

 mk = sub_base%mk
 pk = sub_base%pk
 allocate( aak(mk,mk) , bbk(mk,pk) ,         &
           cck(pk,mk) , ddk(pk,pk) , f2k(pk) )
 allocate(aaki(mk,mk) )
 !$omp single
  sdim = sub_grid%ne*mk**2
  allocate( f_mai(sdim) , f_maj(sdim) , f_max(sdim) )
  sdim = (sub_grid%ne + sub_grid%ni*4 + sub_grid%nb)*mk*pk
  allocate( f_mbi(sdim) , f_mbj(sdim) , f_mbx(sdim) )
  sdim = sub_grid%ne*mk; allocate( f_1(sdim) ); f_1 = 0.0_wp
  sdim = (sub_grid%ne + sub_grid%ni*4 + sub_grid%nb)*pk*mk
  allocate( f_mci(sdim) , f_mcj(sdim) , f_mcx(sdim) )
  sdim = (sub_grid%ne + sub_grid%ni*4 + sub_grid%nb)*pk**2
  allocate( f_mdi(sdim) , f_mdj(sdim) , f_mdx(sdim) )
  sdim = sub_grid%ne*pk; allocate( f_2(sdim) ); f_2 = 0.0_wp
 !$omp end single

 !$omp do schedule(static)
 f_elem_loop: do ie=1,sub_grid%ne

   call ldg_locmat_e( aak , bbk ,              &
                      cck , ddk , f2k ,        &
                      sub_base, sub_grid%e(ie) )

   call invmat_chol( aak , aaki )
   pos_shift_ie = (ie-1)*mk**2
   do i=1,mk; do j=1,mk
     pos = pos_shift_ie + (i-1)*mk + j
     f_mai(pos) = (ie-1)*mk + i
     f_maj(pos) = (ie-1)*mk + j
     f_max(pos) = aaki(i,j)
   enddo; enddo

   pos_shift_ie = (ie-1)*mk*pk
   do i=1,mk; do j=1,pk
     pos = pos_shift_ie + (i-1)*pk + j
     f_mbi(pos) = (ie-1)*mk + i
     f_mbj(pos) = (ie-1)*pk + j
     f_mbx(pos) = bbk(i,j)
   enddo; enddo

   pos_shift_ie = (ie-1)*pk*mk
   do i=1,pk; do j=1,mk
     pos = pos_shift_ie + (i-1)*mk + j
     f_mci(pos) = (ie-1)*pk + i
     f_mcj(pos) = (ie-1)*mk + j
     f_mcx(pos) = cck(i,j)
   enddo; enddo

   pos_shift_ie = (ie-1)*pk**2
   do i=1,pk; do j=1,pk
     pos = pos_shift_ie + (i-1)*pk + j
     f_mdi(pos) = (ie-1)*pk + i
     f_mdj(pos) = (ie-1)*pk + j
     f_mdx(pos) = ddk(i,j)
   enddo; enddo
   do i=1,pk
     ii = (ie-1)*pk + i
     !$omp atomic
      f_2(ii) = f_2(ii) + f2k(i)
   enddo

 enddo f_elem_loop
 !$omp end do nowait
 deallocate( aak , bbk ,     &
             cck , ddk , f2k )
 deallocate(aaki )

 allocate(                  bbk(2*mk,2*pk) , &
           cck(2*pk,2*mk) , ddk(2*pk,2*pk) )
 !$omp do schedule(static)
 f_isid_loop: do is=1,sub_grid%ni

   call ldg_locmat_s(       bbk ,              &
                      cck , ddk ,              &
                      sub_base, sub_grid%s(is) )

   pos_shift_is = sub_grid%ne*mk*pk + (is-1)*4*mk*pk
   do ie=1,2; do i=1,mk; do je=1,2; do j=1,pk
     pos = pos_shift_is + ((( ie-1 )*mk + i-1 )*2 + je-1 )*pk + j
     f_mbi(pos) = (sub_grid%s(is)%ie(ie)-1)*mk + i
     f_mbj(pos) = (sub_grid%s(is)%ie(je)-1)*pk + j
     f_mbx(pos) = bbk((ie-1)*mk+i,(je-1)*pk+j)
   enddo; enddo; enddo; enddo

   pos_shift_is = sub_grid%ne*pk*mk + (is-1)*4*pk*mk
   do ie=1,2; do i=1,pk; do je=1,2; do j=1,mk
     pos = pos_shift_is + ((( ie-1 )*pk + i-1 )*2 + je-1 )*mk + j
     f_mci(pos) = (sub_grid%s(is)%ie(ie)-1)*pk + i
     f_mcj(pos) = (sub_grid%s(is)%ie(je)-1)*mk + j
     f_mcx(pos) = cck((ie-1)*pk+i,(je-1)*mk+j)
   enddo; enddo; enddo; enddo

   pos_shift_is = sub_grid%ne*pk**2 + (is-1)*4*pk**2
   do ie=1,2; do i=1,pk; do je=1,2; do j=1,pk
     pos = pos_shift_is + ((( ie-1 )*pk + i-1 )*2 + je-1 )*pk + j
     f_mdi(pos) = (sub_grid%s(is)%ie(ie)-1)*pk + i
     f_mdj(pos) = (sub_grid%s(is)%ie(je)-1)*pk + j
     f_mdx(pos) = ddk((ie-1)*pk+i,(je-1)*pk+j)
   enddo; enddo; enddo; enddo

 enddo f_isid_loop
 !$omp end do nowait
 deallocate(       bbk , &
             cck , ddk )

 allocate(              bbk(mk,pk) , f1k(mk) , &
           cck(pk,mk) , ddk(pk,pk) , f2k(pk) )
 !$omp do schedule(static)
 f_bsid_loop: do is=sub_grid%ni+1,sub_grid%ns

   call ldg_locmat_bs(       bbk , f1k ,        &
                       cck , ddk , f2k ,        &
                      sub_base,sub_grid%s(is),  &
                      sub_bcs%b_s2bs(is)%p,b_sub_err)

   pos_shift_is = sub_grid%ne*mk*pk + sub_grid%ni*4*mk*pk &
                 + (is-sub_grid%ni-1)*mk*pk
   do i=1,mk; do j=1,pk
     pos = pos_shift_is + (i-1)*pk + j
     f_mbi(pos) = (sub_grid%s(is)%ie(1)-1)*mk + i
     f_mbj(pos) = (sub_grid%s(is)%ie(1)-1)*pk + j
     f_mbx(pos) = bbk(i,j)
   enddo; enddo
   do i=1,mk
     ii = (sub_grid%s(is)%ie(1)-1)*mk + i
     !$omp atomic
      f_1(ii) = f_1(ii) + f1k(i)
   enddo

   pos_shift_is = sub_grid%ne*pk*mk + sub_grid%ni*4*pk*mk &
                 + (is-sub_grid%ni-1)*pk*mk
   do i=1,pk; do j=1,mk
     pos = pos_shift_is + (i-1)*mk + j
     f_mci(pos) = (sub_grid%s(is)%ie(1)-1)*pk + i
     f_mcj(pos) = (sub_grid%s(is)%ie(1)-1)*mk + j
     f_mcx(pos) = cck(i,j)
   enddo; enddo

   pos_shift_is = sub_grid%ne*pk**2 + sub_grid%ni*4*pk**2 &
                 + (is-sub_grid%ni-1)*pk**2
   do i=1,pk; do j=1,pk
     pos = pos_shift_is + (i-1)*pk + j
     f_mdi(pos) = (sub_grid%s(is)%ie(1)-1)*pk + i
     f_mdj(pos) = (sub_grid%s(is)%ie(1)-1)*pk + j
     f_mdx(pos) = ddk(i,j)
   enddo; enddo
   do i=1,pk
     ii = (sub_grid%s(is)%ie(1)-1)*pk + i
     !$omp atomic
      f_2(ii) = f_2(ii) + f2k(i)
   enddo

 enddo f_bsid_loop
 !$omp end do nowait
 deallocate(       bbk , f1k , &
             cck , ddk , f2k )
 !$omp end parallel
 t1 = my_second()
 write(message(1),elapsed_format) &
 'Completed computation of the local fracture matrices: elapsed time ',&
   t1-t0,'s.'
 call info(this_prog_name,'',message(1))
 !$ if(detailed_timing_omp) then
 !$   call omput_write_time()
 !$   call omput_close_timer()
 !$   call omput_pop_key()
 !$ endif

 ii = sub_grid%ne*sub_base%mk
 jj = sub_grid%ne*sub_base%pk
 f_ma = tri2col(new_tri(ii,ii,f_mai-1,f_maj-1,f_max))
 f_mb = tri2col(new_tri(ii,jj,f_mbi-1,f_mbj-1,f_mbx))
 f_mc = tri2col(new_tri(jj,ii,f_mci-1,f_mcj-1,f_mcx))
 f_md = tri2col(new_tri(jj,jj,f_mdi-1,f_mdj-1,f_mdx))
 deallocate( f_mai , f_maj , f_max )
 deallocate( f_mbi , f_mbj , f_mbx )
 deallocate( f_mci , f_mcj , f_mcx )
 deallocate( f_mdi , f_mdj , f_mdx )
 f_mc = (-1.0_wp)*matmul(f_mc,f_ma)
 f_md = f_md + matmul(f_mc,f_mb)
 f_2  = f_2 + matmul(f_mc,f_1)
 call clear( f_ma ); call clear( f_mb ); call clear( f_mc )
 deallocate( f_1 )

 ! Now the fracture terms must be added to the main matrix
 allocate(dofs_remap(sub_grid%ne*sub_base%pk))
 do ie=1,sub_grid%ne
   do i=1,sub_base%pk
     dofs_remap((ie-1)*sub_base%pk+i) = (sub_e2s(ie)-1)*base%nk + i
   enddo
 enddo
 f_md_g = col2tri(f_md)
 f_md_g%n = mmm%n
 f_md_g%m = mmm%m
 do i=1,f_md_g%nz
   f_md_g%ti(i) = dofs_remap(f_md_g%ti(i)+1)-1
   f_md_g%tj(i) = dofs_remap(f_md_g%tj(i)+1)-1
 enddo
 mmm = mmm + ((-1.0_wp)*tri2col(f_md_g))
 do i=1,size(dofs_remap)
   fff(dofs_remap(i)) = fff(dofs_remap(i)) - f_2(i)
 enddo
 deallocate(dofs_remap)
 call clear(f_md_g)
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Partition mmm and the rhs according to the Dirichlet bcs
 t0 = my_second()
 allocate(dofs_dir( base%nk * count(bcs%b_side%bc.eq.b_dir) ))
 allocate(lam2( size(dofs_dir) ))
 allocate(dofs_nat( base%nk*(grid%ns) - size(dofs_dir) ))
 is_dir = 0
 is_int = 0
 do is=1,grid%ni
   is_int = is_int+1
   dofs_nat( (is_int-1)*base%nk+1 : is_int*base%nk ) = &
      (/ (igdll, igdll=(is-1)*base%nk,is*base%nk-1) /)
 enddo
 allocate(norm_e2(base%nk)) ! squared norm of the eta basis functions
 do igdll=1,base%nk
   norm_e2(igdll) = sum(base%wgs * base%e(igdll,:)**2)
 enddo
 do is=grid%ni+1,grid%ns
   if(bcs%b_s2bs(is)%p%bc.eq.b_dir) then ! Dirichlet side
     is_dir = is_dir+1
     dofs_dir( (is_dir-1)*base%nk+1 : is_dir*base%nk ) = &
        (/ (igdll, igdll=(is-1)*base%nk,is*base%nk-1) /)
     do igdll=1,base%nk ! projections (the eta must be orthogonal)
       lam2( (is_dir-1)*base%nk+igdll ) = &
          sum( base%wgs                                          * &
               coeff_dir(affmap(grid%s(is)%e(1)%p,                 &
                                base%xigb(:,:,grid%s(is)%isl(1))), &
                         -grid%s(is)%ie(2))                      * &
               base%e(igdll,:) )/norm_e2(igdll)
     enddo
   else
     is_int = is_int+1
     dofs_nat( (is_int-1)*base%nk+1 : is_int*base%nk ) = &
        (/ (igdll, igdll=(is-1)*base%nk,is*base%nk-1) /)
   endif
 enddo
 deallocate(norm_e2)

 !         row             column
 mmm11 = dofs_nat * (mmm * dofs_nat)
 mmm12 = dofs_nat * (mmm * dofs_dir)
 mmm21 = dofs_dir * (mmm * dofs_nat)
 mmm22 = dofs_dir * (mmm * dofs_dir)

 allocate(fff1(size(dofs_nat)))
 fff1 = fff(dofs_nat+1)

 t1 = my_second()
 write(message(1),elapsed_format) &
   'Completed partitioning of the linear system: elapsed time ',t1-t0,'s.'
 call info(this_prog_name,'',message(1))
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Solution of the linear system
 t0 = my_second()

 allocate(lam1(mmm11%n),rhs(mmm11%n))
 rhs = fff1 - matmul(mmm12,lam2)

 endif mpi_if_1

 ! When the local solver is mumps, all the processes call the solver
 ! routines, otherwise only the master process calls them.
 mpi_if_2: if( (mpi_id.eq.0).or.(linsolver.eq.'mumps') ) then
   call factor(mmm11)
   call solve (mmm11,lam1,rhs)
   call clean ()
 endif mpi_if_2

 mpi_if_3: if(mpi_id.eq.0) then
 deallocate(rhs)

 ! boundary terms
 allocate(fff2(mmm22%n))
 fff2 = matmul(mmm21,lam1) + matmul(mmm22,lam2)

 ! pack back the values
 allocate(lam(size(lam1)+size(lam2)))
 lam(dofs_nat+1) = lam1
 lam(dofs_dir+1) = lam2
 fff(dofs_dir+1) = fff2

 t1 = my_second()
 write(message(1),elapsed_format) &
   'Completed solution of the linear system: elapsed time ',t1-t0,'s.'
 call info(this_prog_name,'',message(1))
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Clear the variables used in the solution of the linear system
 nnz_mmm11 = mmm11%nz
 call clear( mmm11 )
 call clear( mmm12 )
 call clear( mmm21 )
 call clear( mmm22 )
 deallocate( fff1 , fff2 )
 deallocate( lam1 , lam2 )
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Postprocessings
! call mod_ldgh_reconstructions_constructor(base,lam,compute_usef)
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Error computation
! t0 = my_second()
!
! errnorms: if(compute_error_norms) then
!   call mod_error_norms_constructor()
!   
!   call hybrid_err(base,lam,coeff_lam,err_extra_deg, &
!                   e_l2s,e_l2k,e_pl2s,e_pl2k)
!   call primal_err(base,uuu,coeff_lam,err_extra_deg, &
!                   e_u_l2)
!   call primal_err(bases,uuus,coeff_lam,err_extra_deg, &
!                   e_us_l2)
!   if(compute_usef) &
!     call primal_err_sef(bases,uuusef,coeff_xiadv,xiadv_el, &
!                         coeff_lam,err_extra_deg,e_usef_l2)
!   call dual_err(base,qqq,coeff_q,coeff_divq,err_extra_deg, &
!                 e_q_l2,e_q_hdiv)
!!   if(base%r.gt.0) &
!!     call dual_err(base_qsdg,qqqsdg,coeff_q,coeff_divq,err_extra_deg, &
!!                   e_qsdg_l2,e_qsdg_hdiv)
!   call dual_err(base_qsrt,qqqsrt,coeff_q,coeff_divq,err_extra_deg, &
!                 e_qsrt_l2,e_qsrt_hdiv)
!
!   call mod_error_norms_destructor()
! endif errnorms
!
! t1 = my_second()
! write(message(1),elapsed_format) &
!   'Completed error computations: elapsed time ',t1-t0,'s.'
! call info(this_prog_name,'',message(1))
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Write the results
 call new_file_unit(fu,ierr)
 out_file_results_name = trim(in_basename)//out_file_results_suff
 open(fu,file=trim(out_file_results_name), &
      status='replace',action='write',form='formatted',iostat=ierr)
   ! preamble
   call write_octave(test_name       ,'test_name'       ,fu)
   call write_octave(test_description,'test_description',fu)
   call write_octave(nnz_mmm11       ,'nnz_mmm11'       ,fu)
   ! data
   if(write_sys) then
     call write_octave(dofs_nat,'c','dofs_nat',fu)
     call write_octave(dofs_dir,'c','dofs_dir',fu)
     call write_octave(mmm,'mmm',fu)
     call write_octave(fff,'c','fff',fu)
     call write_octave(f_md,'f_m',fu)
     call write_octave(f_2,'c','f_f',fu)
   endif
   call write_octave(lam,'c','lam',fu)
!   call write_octave(uuu,'c','uuu',fu)
!   call write_octave(qqq,'c','qqq',fu)
!   call write_octave(qhn,'qhn',fu)
!   call write_octave(bases,'bases',fu)
!   call write_octave(uuus,'c','uuus',fu)
!   if(compute_usef) then
!     call write_octave(uuusef,'c','uuusef',fu)
!     call write_octave(xiadv_el,'c','xiadv_el',fu)
!   endif
   !> \bug add reconstructions and error computations
!   if(base%k.gt.0) then
!     call write_octave(base_qsdg,'base_qsdg',fu)
!     call write_octave(qqqsdg,'c','qqqsdg',fu)
!   endif
!   call write_octave(base_qsrt,'base_qsrt',fu)
!   call write_octave(qqqsrt,'c','qqqsrt',fu)
!   if(compute_error_norms) then
!     call write_octave(e_l2s ,'e_l2s' ,fu)
!     call write_octave(e_l2k ,'e_l2k' ,fu)
!     call write_octave(e_pl2s,'e_pl2s',fu)
!     call write_octave(e_pl2k,'e_pl2k',fu)
!     call write_octave(e_u_l2,'e_u_l2',fu)
!     call write_octave(e_us_l2,'e_us_l2',fu)
!     if(compute_usef) &
!       call write_octave(e_usef_l2,'e_usef_l2',fu)
!     call write_octave(e_q_l2,'e_q_l2',fu)
!     call write_octave(e_q_hdiv,'e_q_hdiv',fu)
!!     if(base%k.gt.0) then
!!       call write_octave(e_qsdg_l2,'e_qsdg_l2',fu)
!!       call write_octave(e_qsdg_hdiv,'e_qsdg_hdiv',fu)
!!     endif
!     call write_octave(e_qsrt_l2,'e_qsrt_l2',fu)
!     call write_octave(e_qsrt_hdiv,'e_qsrt_hdiv',fu)
!   endif
 close(fu,iostat=ierr)
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Cleanup
! call mod_ldgh_reconstructions_destructor()

 deallocate( f_2 )
 deallocate( dofs_nat , dofs_dir , fff , lam )
 call clear( f_md )
 call clear( mmm )

 call mod_ldg_locmat_destructor()
 call mod_ldgh_locmat_destructor()

 call mod_tau_destructor()

 call mod_testcases_destructor()

 call clear( sub_bcs )
 call clear( bcs )
 call mod_bcs_destructor()

 call clear( base )
 call mod_fe_spaces_destructor()

 call mod_base_destructor()

 call mod_numquad_destructor()

 call clear( sub_grid ); deallocate( sub_v2v , sub_e2s )
 call clear( grid )
 call mod_grid_destructor()

 call mod_master_el_destructor()

 call mod_output_control_destructor()

 endif mpi_if_3

 !-----------------------------------------------------------------------
 ! Cleanup of all the MPI processes
 !-----------------------------------------------------------------------

 call mod_linsolver_destructor()

 deallocate( bc_type_t )

 call mod_octave_io_sparse_destructor()
 call mod_sparse_destructor()

 call mod_octave_io_perms_destructor()
 call mod_perms_destructor()

 call mod_linal_destructor()

 call mod_octave_io_sympoly_destructor()
 call mod_sympoly_destructor()

 call mod_octave_io_destructor()

 call mod_fu_manager_destructor()

 call mod_mpi_utils_destructor()
 call mpi_finalize(ierr)

 call mod_kinds_destructor()

 t1 = my_second()
 write(message(1),elapsed_format) &
   'Done: elapsed time ',t1-t00,'s.'
 call info(this_prog_name,'',message(1))

 !$ if(detailed_timing_omp) call mod_omp_utils_destructor()

 call mod_messages_destructor()
 !-----------------------------------------------------------------------

end program ldghf_main

